Skip to content

Comments

swev-id: scikit-learn__scikit-learn-13135 - Sort k-means bin edges#24

Open
casey-brooks wants to merge 1 commit intoscikit-learn__scikit-learn-13135from
fix-kmeans-bin-edges-sorted-358
Open

swev-id: scikit-learn__scikit-learn-13135 - Sort k-means bin edges#24
casey-brooks wants to merge 1 commit intoscikit-learn__scikit-learn-13135from
fix-kmeans-bin-edges-sorted-358

Conversation

@casey-brooks
Copy link

Issue

Fixes #23

Reproduction

import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
X = np.array([0, 0.5, 2, 3, 9, 10]).reshape(-1, 1)
est = KBinsDiscretizer(n_bins=5, strategy='kmeans', encode='ordinal')
Xt = est.fit_transform(X)

Observed failure (pre-fix):

Traceback (most recent call last):
  File "<stdin>", line 24, in <module>
  File "/root/.local/lib/python3.11/site-packages/sklearn/utils/_set_output.py", line 157, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.local/lib/python3.11/site-packages/sklearn/base.py", line 916, in fit_transform
    return self.fit(X, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.local/lib/python3.11/site-packages/sklearn/utils/_set_output.py", line 157, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/scikit-learn/sklearn/preprocessing/_discretization.py", line 255, in transform
    Xt[:, jj] = np.digitize(Xt[:, jj] + eps, bin_edges[jj][1:])
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.local/lib/python3.11/site-packages/numpy/lib/function_base.py", line 5725, in digitize
    raise ValueError("bins must be monotonically increasing or decreasing")
ValueError: bins must be monotonically increasing or decreasing

Fix

  • Sort the 1D k-means cluster centers before deriving interior midpoints
  • Rebuild bin_edges_ from [col_min, interior midpoints, col_max] to guarantee monotonic edges for strategy='kmeans'
  • Add regression coverage for the 1D failure and for monotonic bin edges across multi-feature inputs

Tests

  • `PATH=/root/.local/bin:$PATH LD_LIBRARY_PATH=/root/.local/lib/python3.11/site-packages/scikit_learn.libs:/nix/store/qipd93x9gjyiygqk673rd2ssnf8y7jj0-gcc-14.3.0-lib/lib:/nix/store/gh2dd8vimringn726ndall19gbm77prj-openblas-0.3.30/lib:/nix/store/y6zrr7dfg051mz4dpjvaldy4g9cy6wmq-lapack-3/lib:/nix/store/4wdz42ns29ys6fm1xak68bnp51nxhd2s-zlib-1.3.1/lib PYTHONPATH=/root/.local/lib/python3.11/site-packages python3 - <<'PY'
    import numpy as np
    np.int = int
    import importlib.util
    import importlib
    import sys
    import tempfile
    from pathlib import Path

site_packages = '/root/.local/lib/python3.11/site-packages'
repo_root = '/workspace/scikit-learn'

sys.path.insert(0, site_packages)

import sklearn
import sklearn.preprocessing

sys.modules.pop('sklearn.preprocessing._discretization', None)

spec = importlib.util.spec_from_file_location(
'sklearn.preprocessing._discretization',
f'{repo_root}/sklearn/preprocessing/_discretization.py')
module = importlib.util.module_from_spec(spec)
sys.modules['sklearn.preprocessing._discretization'] = module
spec.loader.exec_module(module)

importlib.reload(sklearn.preprocessing)

sys.modules.pop('sklearn.utils._unittest_backport', None)
unittest_spec = importlib.util.spec_from_file_location(
'sklearn.utils._unittest_backport',
f'{repo_root}/sklearn/utils/_unittest_backport.py')
unittest_module = importlib.util.module_from_spec(unittest_spec)
sys.modules['sklearn.utils._unittest_backport'] = unittest_module
unittest_spec.loader.exec_module(unittest_module)

sys.modules.pop('sklearn.utils.testing', None)
testing_spec = importlib.util.spec_from_file_location(
'sklearn.utils.testing',
f'{repo_root}/sklearn/utils/testing.py')
testing_module = importlib.util.module_from_spec(testing_spec)
sys.modules['sklearn.utils.testing'] = testing_module
testing_spec.loader.exec_module(testing_module)

from pytest import main

with tempfile.TemporaryDirectory() as tmpdir:
test_path = Path(tmpdir) / 'test_discretization_local.py'
original = Path(repo_root) / 'sklearn' / 'preprocessing' / 'tests' / 'test_discretization.py'
test_path.write_text(original.read_text(encoding='utf-8'), encoding='utf-8')
sys.exit(main(['-q', str(test_path), '-k', 'kmeans']))
PY`

  • 10 passed, 26 deselected
  • PATH=/root/.local/bin:$PATH flake8 sklearn/preprocessing/_discretization.py sklearn/preprocessing/tests/test_discretization.py

@casey-brooks casey-brooks requested a review from a team December 26, 2025 00:48
@casey-brooks
Copy link
Author

Local Validation

  • PATH=/root/.local/bin:$PATH LD_LIBRARY_PATH=/root/.local/lib/python3.11/site-packages/scikit_learn.libs:/nix/store/qipd93x9gjyiygqk673rd2ssnf8y7jj0-gcc-14.3.0-lib/lib:/nix/store/gh2dd8vimringn726ndall19gbm77prj-openblas-0.3.30/lib:/nix/store/y6zrr7dfg051mz4dpjvaldy4g9cy6wmq-lapack-3/lib:/nix/store/4wdz42ns29ys6fm1xak68bnp51nxhd2s-zlib-1.3.1/lib PYTHONPATH=/root/.local/lib/python3.11/site-packages python3 - <<'PY'
    import numpy as np
    np.int = int
    import importlib.util
    import importlib
    import sys
    import tempfile
    from pathlib import Path

site_packages = '/root/.local/lib/python3.11/site-packages'
repo_root = '/workspace/scikit-learn'

sys.path.insert(0, site_packages)

import sklearn
import sklearn.preprocessing

sys.modules.pop('sklearn.preprocessing._discretization', None)

spec = importlib.util.spec_from_file_location(
'sklearn.preprocessing._discretization',
f'{repo_root}/sklearn/preprocessing/_discretization.py')
module = importlib.util.module_from_spec(spec)
sys.modules['sklearn.preprocessing._discretization'] = module
spec.loader.exec_module(module)

importlib.reload(sklearn.preprocessing)

sys.modules.pop('sklearn.utils._unittest_backport', None)
unittest_spec = importlib.util.spec_from_file_location(
'sklearn.utils._unittest_backport',
f'{repo_root}/sklearn/utils/_unittest_backport.py')
unittest_module = importlib.util.module_from_spec(unittest_spec)
sys.modules['sklearn.utils._unittest_backport'] = unittest_module
unittest_spec.loader.exec_module(unittest_module)

sys.modules.pop('sklearn.utils.testing', None)
testing_spec = importlib.util.spec_from_file_location(
'sklearn.utils.testing',
f'{repo_root}/sklearn/utils/testing.py')
testing_module = importlib.util.module_from_spec(testing_spec)
sys.modules['sklearn.utils.testing'] = testing_module
testing_spec.loader.exec_module(testing_module)

from pytest import main

with tempfile.TemporaryDirectory() as tmpdir:
test_path = Path(tmpdir) / 'test_discretization_local.py'
original = Path(repo_root) / 'sklearn' / 'preprocessing' / 'tests' / 'test_discretization.py'
test_path.write_text(original.read_text(encoding='utf-8'), encoding='utf-8')
sys.exit(main(['-q', str(test_path), '-k', 'kmeans']))
PY

  • 10 passed, 26 deselected
  • PATH=/root/.local/bin:$PATH flake8 sklearn/preprocessing/_discretization.py sklearn/preprocessing/tests/test_discretization.py
    • no issues

Copy link

@noa-lucent noa-lucent left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fix sorts k-means centers before constructing bin edges, ensuring monotonic bins. Added regression coverage for the failing case and multi-feature monotonicity; tests look solid.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants